Happy Birthday JESSE

![AM album cover](/Users/sophiang/Documents/am.jpeg "am")

# Pull the full lyric set for Arctic Monkeys' album "AM" via the Genius API.
library(genius)

# genius_album() returns one row per lyric line, with track metadata
# (title, track number, URL) joined on automatically.
arc_mon_am <- genius_album(album = "AM", artist = "Arctic Monkeys")
## Joining, by = c("track_title", "track_n", "track_url")

Sentiment Analysis

library(tidyverse)
## ── Attaching packages ───────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.4
## ✓ tidyr   1.0.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ──────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidytext)
library(topicmodels)

# Tokenize the lyrics, score each word against the Bing sentiment lexicon,
# and chart the 10 words contributing most to each sentiment.
# tidy_monkey holds the ggplot object; it is rendered on the last line.
tidy_monkey <- arc_mon_am %>%
  unnest_tokens(word, lyric) %>%           # one row per word
  anti_join(stop_words) %>%                # drop common English stopwords
  inner_join(get_sentiments("bing")) %>%   # keep only lexicon-scored words
  count(word, sentiment, sort = TRUE) %>%  # frequency per (word, sentiment);
                                           # count() already returns an
                                           # ungrouped frame, so the redundant
                                           # ungroup() that followed it is gone
  group_by(sentiment) %>%
  top_n(10, n) %>%                         # ranking column made explicit
                                           # (was implicit: "Selecting by n")
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%      # order bars by frequency
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
## Joining, by = "word"
## Joining, by = "word"
tidy_monkey

Sentiment Analysis II

library(textdata)

# Preview the NRC lexicon: each word can map to several emotion categories.
get_sentiments("nrc")
## # A tibble: 13,901 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # … with 13,891 more rows

# Same pipeline as the Bing analysis, but faceted over the NRC emotion
# categories. tidy_monkey_2 holds the ggplot; it is rendered on the last line.
tidy_monkey_2 <- arc_mon_am %>%
  unnest_tokens(word, lyric) %>%           # one row per word
  anti_join(stop_words) %>%                # drop common English stopwords
  inner_join(get_sentiments("nrc")) %>%    # attach NRC emotion categories
  count(word, sentiment, sort = TRUE) %>%  # count() returns an ungrouped
                                           # frame; the redundant ungroup()
                                           # that followed it is removed
  group_by(sentiment) %>%
  top_n(10, n) %>%                         # ranking column made explicit
                                           # (was implicit: "Selecting by n")
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
## Joining, by = "word"
## Joining, by = "word"
tidy_monkey_2

Tidying the Data with tm package

library(tm)

# Build a corpus from the lyric lines, then run the standard tm cleaning
# sequence: lowercase -> collapse whitespace -> strip punctuation ->
# drop English stopwords -> stem each word.
corpus_raw <- Corpus(VectorSource(arc_mon_am$lyric))

corpus <- corpus_raw %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(stripWhitespace) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(stemDocument)

# Document-term matrix: one row per document (lyric line), one column per
# stemmed term.
dtm <- DocumentTermMatrix(corpus)

# Convert the document-term matrix to a tidy (document, term, count) tibble.
corpus_tidy <- tidy(dtm)

# Rank terms by tf-idf. Note: this pipeline is printed, not stored — the
# original's stray indentation made it look like a continuation of the
# assignment above, which it is not.
corpus_tidy %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf_idf))
## # A tibble: 1,574 x 6
##    document term    count    tf   idf tf_idf
##    <chr>    <chr>   <dbl> <dbl> <dbl>  <dbl>
##  1 309      bumpin      1     1  6.19   6.19
##  2 350      forev       1     1  5.09   5.09
##  3 15       youv        1     1  4.80   4.80
##  4 34       youv        1     1  4.80   4.80
##  5 411      team        1     1  4.80   4.80
##  6 415      team        1     1  4.80   4.80
##  7 422      team        1     1  4.80   4.80
##  8 426      team        1     1  4.80   4.80
##  9 17       mayb        1     1  4.58   4.58
## 10 182      shoowop     4     1  4.58   4.58
## # … with 1,564 more rows

Latent Dirichlet Allocation (LDA)

# LDA requires every document (row) to have at least one non-zero entry,
# so drop lyric lines whose terms were all removed during cleaning.
doc_totals <- apply(dtm, 1, FUN = sum)
dtm <- dtm[doc_totals != 0, ]

# Fit a 3-topic LDA model; the fixed seed makes the fit reproducible.
output <- LDA(dtm, k = 3, control = list(seed = 1234))

# Per-topic word probabilities ("beta"): one row per (topic, term) pair.
beta <- tidy(output, matrix = "beta")

# Inspect the highest-probability terms within each topic.
beta %>% filter(topic == 1) %>% arrange(desc(beta))
## # A tibble: 560 x 3
##    topic term     beta
##    <int> <chr>   <dbl>
##  1     1 wanna  0.0439
##  2     1 your   0.0233
##  3     1 come   0.0213
##  4     1 get    0.0210
##  5     1 make   0.0209
##  6     1 just   0.0201
##  7     1 like   0.0192
##  8     1 road   0.0183
##  9     1 mine   0.0180
## 10     1 number 0.0161
## # … with 550 more rows
beta %>% filter(topic == 2) %>% arrange(desc(beta))
## # A tibble: 560 x 3
##    topic term      beta
##    <int> <chr>    <dbl>
##  1     2 wanna   0.0351
##  2     2 snap    0.0234
##  3     2 feel    0.0181
##  4     2 get     0.0175
##  5     2 come    0.0170
##  6     2 shoowop 0.0161
##  7     2 mine    0.0158
##  8     2 just    0.0155
##  9     2 thought 0.0127
## 10     2 ever    0.0114
## # … with 550 more rows
beta %>% filter(topic == 3) %>% arrange(desc(beta))
## # A tibble: 560 x 3
##    topic term      beta
##    <int> <chr>    <dbl>
##  1     3 ooh     0.0463
##  2     3 one     0.0225
##  3     3 come    0.0207
##  4     3 babi    0.0166
##  5     3 night   0.0155
##  6     3 shoowop 0.0154
##  7     3 know    0.0145
##  8     3 oooh    0.0144
##  9     3 there   0.0132
## 10     3 like    0.0128
## # … with 550 more rows

# Per-document topic mixtures ("gamma") for the first few documents.
posterior(output, dtm)$topics %>% head() %>% round(digits = 3)
##       1     2     3
## 1 0.338 0.330 0.332
## 2 0.328 0.344 0.328
## 3 0.333 0.346 0.321
## 4 0.333 0.340 0.327
## 5 0.337 0.331 0.333
## 6 0.321 0.323 0.356
# Pull the 10 highest-beta terms within each of the 3 topics.
monkey_top_terms <- beta %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# Faceted bar chart of the top terms per topic. reorder_within() plus
# scale_x_reordered() keep the ordering correct inside each facet
# after coord_flip().
g_monkey_top_terms <- monkey_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

g_monkey_top_terms

In Defense of Wordclouds

library(wordcloud)
## Loading required package: RColorBrewer

# Word cloud of the cleaned "AM" corpus (a single album — the original
# comment said "three studio albums", which does not match this corpus).
# Most frequent terms are placed first (random.order = FALSE).
# NOTE(review): the original call passed `ordered.clouds = TRUE`, which is
# not a wordcloud() argument (likely a typo for `ordered.colors`); it was
# only forwarded through `...` with no documented effect, so it is dropped.
# wordcloud() draws as a side effect, so monkey_cloud holds no plot data.
monkey_cloud <- wordcloud(corpus, max.words = 70, random.order = FALSE)